{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/M6adb1j2jPI?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#@title\n",
    "from IPython.display import HTML\n",
    "\n",
    "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/M6adb1j2jPI?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers and Datasets libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install datasets transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]\n",
      "[1045, 5223, 2023, 1012]\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "sentences = [\n",
    "    \"I've been waiting for a HuggingFace course my whole life.\",\n",
    "    \"I hate this.\",\n",
    "]\n",
    "tokens = [tokenizer.tokenize(sentence) for sentence in sentences]\n",
    "ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]\n",
    "print(ids[0])\n",
    "print(ids[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "expected sequence of length 14 at dim 1 (got 4)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-2-b3483c81e2fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m        [1045, 5223, 2023, 1012]]\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0minput_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m: expected sequence of length 14 at dim 1 (got 4)"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
    "       [1045, 5223, 2023, 1012]]\n",
    "\n",
    "input_ids = torch.tensor(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
    "       [1045, 5223, 2023, 1012,    0,    0,    0,     0,     0,    0,    0,    0,    0,    0]]\n",
    "\n",
    "input_ids = torch.tensor(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "tokenizer.pad_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)\n",
      "tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)\n",
      "tensor([[-2.7276,  2.8789],\n",
      "        [ 1.5444, -1.3998]], grad_fn=<AddmmBackward>)\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForSequenceClassification\n",
    "\n",
    "ids1 = torch.tensor(\n",
    "    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]\n",
    ")\n",
    "ids2 = torch.tensor([[1045, 5223, 2023, 1012]])\n",
    "all_ids = torch.tensor(\n",
    "    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
    "     [1045, 5223, 2023, 1012,    0,    0,    0,     0,     0,    0,    0,    0,    0,    0]]\n",
    ")\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "print(model(ids1).logits)\n",
    "print(model(ids2).logits)\n",
    "print(model(all_ids).logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_ids = torch.tensor(\n",
    "    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
    "     [1045, 5223, 2023, 1012,    0,    0,    0,     0,     0,    0,    0,    0,    0,    0]]\n",
    ")\n",
    "attention_mask = torch.tensor(\n",
    "    [[   1,    1,    1,    1,    1,    1,    1,     1,     1,    1,    1,    1,    1,    1],\n",
    "     [   1,    1,    1,    1,    0,    0,    0,     0,     0,    0,    0,    0,    0,    0]]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)\n",
      "tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)\n"
     ]
    }
   ],
   "source": [
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "output1 = model(ids1)\n",
    "output2 = model(ids2)\n",
    "print(output1.logits)\n",
    "print(output2.logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[-2.7276,  2.8789],\n",
      "        [ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)\n"
     ]
    }
   ],
   "source": [
    "output = model(all_ids, attention_mask=attention_mask)\n",
    "print(output.logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "sentences = [\n",
    "    \"I've been waiting for a HuggingFace course my whole life.\",\n",
    "    \"I hate this.\",\n",
    "]\n",
    "print(tokenizer(sentences, padding=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "name": "Batching inputs together (PyTorch)",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
